GitHub Repository: debakarr/machinelearning
Path: blob/master/Part 7 - Natural Language Processing/[R] Natural Language Processing.ipynb
Kernel: R

Natural Language Processing

Data Preprocessing

# Importing the dataset (quote = '' so quotation marks inside reviews are not treated as delimiters)
dataset_original = read.delim('Restaurant_Reviews.tsv', quote = '', stringsAsFactors = FALSE)
head(dataset_original, 10)
dim(dataset_original)

Cleaning the texts

# install.packages('tm')
library(tm)
corpus = VCorpus(VectorSource(dataset_original$Review))

# Lowercase each word
corpus = tm_map(corpus, content_transformer(tolower))
Loading required package: NLP
dataset_original$Review[1]
as.character(corpus[[1]])
# Removing all the numbers
corpus = tm_map(corpus, removeNumbers)
dataset_original$Review[29]
as.character(corpus[[29]])
# Removing all the punctuation
corpus = tm_map(corpus, removePunctuation)
dataset_original$Review[1]
as.character(corpus[[1]])
# Removing stopwords, e.g. 'the', 'a', 'an', 'in', 'on' (articles, prepositions, and other common words)
corpus = tm_map(corpus, removeWords, stopwords())
dataset_original$Review[1]
as.character(corpus[[1]])
# Stemming: reducing each word to its root form (e.g. 'loved' -> 'love')
# install.packages('SnowballC')
corpus = tm_map(corpus, stemDocument)
dataset_original$Review[1]
as.character(corpus[[1]])
# Removing extra whitespace, if any
# corpus = tm_map(corpus, stripWhitespace)
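For reuse, the whole cleaning pipeline above can be wrapped in one helper. This is a minimal sketch of our own (the name clean_corpus does not appear in the notebook); it assumes tm and SnowballC are installed as in the cells above:

# Bundle the cleaning steps above into a single function (our sketch)
clean_corpus = function(texts) {
  corpus = VCorpus(VectorSource(texts))
  corpus = tm_map(corpus, content_transformer(tolower))
  corpus = tm_map(corpus, removeNumbers)
  corpus = tm_map(corpus, removePunctuation)
  corpus = tm_map(corpus, removeWords, stopwords())
  corpus = tm_map(corpus, stemDocument)
  tm_map(corpus, stripWhitespace)
}

# Equivalent to the cells above:
# corpus = clean_corpus(dataset_original$Review)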

Creating the Bag of Words model

dtm = DocumentTermMatrix(corpus)
dim(dtm)
dtm
<<DocumentTermMatrix (documents: 1000, terms: 1577)>>
Non-/sparse entries: 5435/1571565
Sparsity           : 100%
Maximal term length: 32
Weighting          : term frequency (tf)
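Reading the summary: 1000 documents times 1577 terms gives 1,577,000 cells, of which only 5435 are non-zero, so about 99.7% of the matrix is empty; the printout rounds this to 100%. A quick check (our addition):

# Fraction of non-zero entries in the document-term matrix
5435 / (1000 * 1577)   # ~0.0034, i.e. ~99.7% of entries are zero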
# Filter out infrequent words: removeSparseTerms(dtm, 0.999) drops every term
# whose sparsity exceeds 0.999, i.e. any term appearing in only one review
dtm = removeSparseTerms(dtm, 0.999)
dtm
<<DocumentTermMatrix (documents: 1000, terms: 691)>>
Non-/sparse entries: 4549/686451
Sparsity           : 99%
Maximal term length: 12
Weighting          : term frequency (tf)
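The filter keeps 691 of the 1577 terms. As a sanity check (our addition), every surviving term should occur in more than 1000 * (1 - 0.999) = 1 review:

# Each remaining term should appear in at least two reviews
all(colSums(as.matrix(dtm) > 0) >= 2)   # expected: TRUE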
# Build the final dataset: one column per term, plus the dependent variable
dataset = as.data.frame(as.matrix(dtm))
dataset$Liked = dataset_original$Liked
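A quick shape check (our addition) explains the column indices used below: the data frame holds 691 term columns plus Liked, so Liked is column 692 and training_set[-692] drops only the label:

dim(dataset)   # expected: 1000  692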
# Encoding the target feature as factor
dataset$Liked = factor(dataset$Liked, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(1234)
split = sample.split(dataset$Liked, SplitRatio = 0.80)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Fitting Naive Bayes to the Training set
# install.packages('e1071')
library(e1071)
classifier = naiveBayes(x = training_set[-692],
                        y = training_set$Liked)

# Predicting the Test set results
y_pred = predict(classifier, newdata = test_set[-692])

# Making the Confusion Matrix
cm = table(test_set[, 692], y_pred)
cm
   y_pred
      0  1
  0   9 91
  1   7 93
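The notebook stops at the confusion matrix; as a sketch (our addition), accuracy can be read off it directly. Naive Bayes manages only about 51% here, barely above chance; one plausible reason is that e1071's naiveBayes models numeric predictors as Gaussians, a poor fit for these 0/1 word counts:

# Accuracy: correct predictions over all 200 test reviews
sum(diag(cm)) / sum(cm)   # (9 + 93) / 200 = 0.51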
# Encoding the target feature as factor
dataset$Liked = factor(dataset$Liked, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(1234)
split = sample.split(dataset$Liked, SplitRatio = 0.80)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Fitting Random Forest to the Training set
# install.packages('randomForest')
library(randomForest)
classifier = randomForest(x = training_set[-692],
                          y = training_set$Liked,
                          ntree = 10)

# Predicting the Test set results
y_pred = predict(classifier, newdata = test_set[-692])

# Making the Confusion Matrix
cm = table(test_set[, 692], y_pred)
randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.
cm
   y_pred
      0  1
  0  76 24
  1  28 72
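The same computation (our addition) gives the Random Forest a clear edge over Naive Bayes on this split; raising ntree above 10 may improve the result further:

# Accuracy for the Random Forest classifier
sum(diag(cm)) / sum(cm)   # (76 + 72) / 200 = 0.74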